import requests
from fake_useragent import UserAgent
from lxml import etree
from random import randint
from time import sleep


def get_html(url):
    headers = {
        "User-Agent": UserAgent().random
    }
    # s随机睡眠3-10秒
    # sleep(randint(5,10))
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    print(response.text)
    if response.status_code == 200:
        return response.text
    else:
        return None


def parse_index(html):
    e = etree.HTML(html)
    all_url = e.xpath('//div[@class="channel-detail movie-item-title"]/a/@href')
    return ['http://maoyan.com{}'.format(url) for url in all_url]


def parse_info(html):
    e = etree.HTML(html)
    name = e.xpath('//h1[@class="name"]/text()')[0]
    types = e.xpath('//li[@class="ellipsis"]/a[1]/text()')[0]
    # 去掉重复
    actors = e.xpath('//li[@class="celebrity actor"]/div[@class="info"]/a/text()')
    actors = format_actors(actors)
    return {
        "name": name,
        "type": types,
        "actor": actors
    }


def format_actors(actors):
    actor_set = set()
    for actor in actors:
        actor_set.add(actor.strip())
    return actor_set

def main():
    # index_url = 'https://maoyan.com/films'
    index_url = 'https://maoyan.com/films?catId=2&showType=3'
    html = get_html(index_url)
    movie_urls = parse_index(html)
    for url in movie_urls:
        movie_html = get_html(url)
        movie = parse_info(movie_html)
        print(movie)

    print(movie_urls)


if __name__ == '__main__':
    main()
